Abstract
Here we will use the EPIC Methylation Array data from Human Pancreatic Islets from individuals with and without Type 2 Diabetes (T2D) and perform diebetes status linear predictive analysis with PLS-DA model on 110 manually selected individuals with most reliable phenotypic information.
All the analysis will be performed on the 110 individuals whic were selected based on multiple criteria such as 1) they are either normo-glycimic or hyper-glycemic / T2D individuals, 2) they have information from all the 4 OMICS (methylation, transcriptomics, phenotypes, genotypes), 3) they all fall within the same age category, and a few other minor criteria. Now we will read the list of those 110 individuals and display a few of them:
set.seed(1)
selected_ind<-scan("OVERLAPPING_110_SAMPLES_4OMICS.txt", what = "charater")
selected_ind<-paste0("ID",selected_ind)
print(head(selected_ind, 20))
## [1] "ID1" "ID3" "ID4" "ID6" "ID8" "ID10" "ID14" "ID19" "ID21" "ID30"
## [11] "ID32" "ID34" "ID35" "ID36" "ID38" "ID39" "ID44" "ID46" "ID55" "ID58"
print(tail(selected_ind, 20))
## [1] "ID136" "ID156" "ID160" "ID165" "ID168" "ID172" "ID175" "ID182"
## [9] "ID183" "ID191" "ID194" "ID200" "ID209" "ID210" "ID214" "ID217"
## [17] "ID221" "ID228" "ID263" "ID273"
We will start with the analysis of phenotypes available from approximately 300 Human Panctreatic Islets (but concentrate only on the 110 selected individuals) in order to check the relattion between the phenotypes. The phenotype file below was provided by Human Tissue Lab (HTL) from Lund University Diabetes Center (LUDC), it includes phenotypic information from approximately 300 pancreatic islets donors. We start with reading the file and removing non-relevant phenotypes:
We will also add information about Methylation EPIC Array batches and RNAseq batches (old vs. new chemistry) in order to see what phenotypes the batches correlate with. This is a potential danger if the batches happen to correlate with valuables phenotypes like T2D or HbA1c. As a control of noise level we will introduce a random variable, we will use it to check later how much of phenotypic variation can be explained by chance.
htl<-read.csv("htl_donors.csv",header=TRUE,check.names=FALSE)
htl$HTL_donor_ID<-paste0("ID",htl$HTL_donor_ID)
phen_meth_batch<-read.delim("phenotypes_islets.txt",header=TRUE,row.names=1,check.names=FALSE,sep="\t")
htl$Meth_Batch<-phen_meth_batch$batch[match(as.character(htl$HTL_donor_ID),rownames(phen_meth_batch))]
htl$Meth_Batch[is.na(htl$Meth_Batch)]<-21
phen_rnaseq_batch<-read.delim("phen_rnaseq_batch.txt",header=TRUE,row.names=1,check.names=FALSE,sep="\t")
htl$RNA_Batch<-phen_rnaseq_batch$batch[match(as.character(htl$HTL_donor_ID),rownames(phen_rnaseq_batch))]
htl$Random<-log(sample(dim(htl)[1]))
rownames(htl)<-htl$HTL_donor_ID
htl$HTL_donor_ID<-NULL
htl$NICS_donor_ID<-NULL
htl$Birth<-NULL
htl$HbA1c_mmol_mol<-NULL
htl$Blood_group<-NULL
htl$Non_diabetic<-NULL
htl$GAD<-NULL
htl$Gestational<-NULL
htl$Date_isolated_islets<-NULL
htl$Date_recieved_islets<-NULL
htl$Islet_equivalences<-NULL
htl$Diabetes_treatment<-NULL
head(htl)
## Gender Age BMI HbA1c_perc T2D Stimulatory_index Purity_perc
## ID1 Male 68 24.7 5.8 0 9.6 75
## ID2 Male 55 24.7 NA 0 10.2 90
## ID3 Female 45 23.9 5.4 0 3.6 70
## ID4 Female 61 27.7 NA 0 14.9 80
## ID5 Female 69 17.6 NA 0 10.9 37
## ID6 Male 67 28.4 5.5 0 1.9 95
## Days_cultured Meth_Batch RNA_Batch Random
## ID1 5 1 old_chem 5.117994
## ID2 8 21 old_chem 4.859812
## ID3 3 1 old_chem 5.598422
## ID4 5 21 old_chem 5.231109
## ID5 2 21 old_chem 4.442651
## ID6 2 1 old_chem 5.624018
For convenience we will rename HbA1c_perc to HbA1c, Stimulatory_index to SI, Purity_perc to Purity, Days_cultured to DIC. We will also assign donors with HbA1c > 6.5 to diabetics and balance the data set a bit better in this way.
names(htl)[names(htl)=="HbA1c_perc"]<-"HbA1c"
names(htl)[names(htl)=="Stimulatory_index"]<-"SI"
names(htl)[names(htl)=="Purity_perc"]<-"Purity"
names(htl)[names(htl)=="Days_cultured"]<-"DIC"
htl$T2D<-ifelse( ((is.na(htl$T2D)==FALSE & htl$T2D==1) | (is.na(htl$HbA1c)==FALSE & htl$HbA1c>6.5)), 1, 0)
head(htl)
## Gender Age BMI HbA1c T2D SI Purity DIC Meth_Batch RNA_Batch
## ID1 Male 68 24.7 5.8 0 9.6 75 5 1 old_chem
## ID2 Male 55 24.7 NA 0 10.2 90 8 21 old_chem
## ID3 Female 45 23.9 5.4 0 3.6 70 3 1 old_chem
## ID4 Female 61 27.7 NA 0 14.9 80 5 21 old_chem
## ID5 Female 69 17.6 NA 0 10.9 37 2 21 old_chem
## ID6 Male 67 28.4 5.5 0 1.9 95 2 1 old_chem
## Random
## ID1 5.117994
## ID2 4.859812
## ID3 5.598422
## ID4 5.231109
## ID5 4.442651
## ID6 5.624018
Now let us assign “numeric” or “factor” status to the variables. For simplicity, we will treat a variable with less than 2 factor levels as “factor” data type, and else as “numeric” data type. DIC and Meth_Batch can also be considered as factors, however in this case we will need to do a multinomial (not a binomial/logistic) regression which is cumbersome, not clear how to extract adjusted R squared info, and most likely will not bring drammatically different results, so we will stick to only binomial/logistic regression for simplicity.
for(i in 1:ncol(htl))
{
if(length(levels(factor(htl[,i]))) > 2)
{
htl[,i]<-as.numeric(as.character(htl[,i]))
}
else
{
htl[,i]<-as.factor(htl[,i])
}
}
head(htl)
## Gender Age BMI HbA1c T2D SI Purity DIC Meth_Batch RNA_Batch
## ID1 Male 68 24.7 5.8 0 9.6 75 5 1 old_chem
## ID2 Male 55 24.7 NA 0 10.2 90 8 21 old_chem
## ID3 Female 45 23.9 5.4 0 3.6 70 3 1 old_chem
## ID4 Female 61 27.7 NA 0 14.9 80 5 21 old_chem
## ID5 Female 69 17.6 NA 0 10.9 37 2 21 old_chem
## ID6 Male 67 28.4 5.5 0 1.9 95 2 1 old_chem
## Random
## ID1 5.117994
## ID2 4.859812
## ID3 5.598422
## ID4 5.231109
## ID5 4.442651
## ID6 5.624018
Now we are going to select the 110 individuals with overlapping OMICs and keep only those individuals for further downstream analysis:
htl<-htl[match(selected_ind,rownames(htl)),]
head(htl)
## Gender Age BMI HbA1c T2D SI Purity DIC Meth_Batch RNA_Batch
## ID1 Male 68 24.7 5.8 0 9.6 75 5 1 old_chem
## ID3 Female 45 23.9 5.4 0 3.6 70 3 1 old_chem
## ID4 Female 61 27.7 NA 0 14.9 80 5 21 old_chem
## ID6 Male 67 28.4 5.5 0 1.9 95 2 1 old_chem
## ID8 Male 51 27.0 4.3 0 37.3 75 2 14 old_chem
## ID10 Female 60 26.1 NA 0 5.6 75 2 21 old_chem
## Random
## ID1 5.117994
## ID3 5.598422
## ID4 5.231109
## ID6 5.624018
## ID8 4.369448
## ID10 3.610918
Now let us calculate pair-wise linear regression for the phenotypes and extract the adjusted R squared information which is equaivalent to the fraction of variation in the response variable explained by the predictor variable. In case the response variable is a factor, we will be using logistic regression (Generalized Linear Model, GLM, with family = “binomial”) and calculate the adjusted R suared as:
\[R^2_{adj} = 1 - \frac{\rm{Residual Deviance}}{\rm{Null Deviance}}\]
htl_adj_r_squared<-matrix(NA,ncol=ncol(htl),nrow=ncol(htl))
for(i in 1:ncol(htl))
{
print(i)
for(j in 1:ncol(htl))
{
if(typeof(htl[,i])=="double" & (typeof(htl[,j])=="double" | typeof(htl[,j])=="integer"))
{
model<-suppressWarnings(lm(htl[,i]~htl[,j]))
htl_adj_r_squared[j,i]<-suppressWarnings(summary(model)$adj.r.squared)
}
if(typeof(htl[,i])=="integer" & typeof(htl[,j])=="double")
{
model<-suppressWarnings(lm(htl[,j]~htl[,i]))
htl_adj_r_squared[j,i]<-suppressWarnings(summary(model)$adj.r.squared)
}
if(typeof(htl[,i])=="integer" & typeof(htl[,j])=="integer")
{
model<-suppressWarnings(glm(htl[,i]~htl[,j],family="binomial"))
htl_adj_r_squared[j,i]<-1-model$deviance/model$null.deviance
}
}
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
htl_adj_r_squared<-as.data.frame(htl_adj_r_squared)
colnames(htl_adj_r_squared)<-colnames(htl)
rownames(htl_adj_r_squared)<-colnames(htl)
htl_adj_r_squared[htl_adj_r_squared<0]<-0
htl_adj_r_squared
## Gender Age BMI HbA1c T2D
## Gender 1.0000000000 0.000000000 0.000000000 0.00000000 6.709567e-05
## Age 0.0000000000 1.000000000 0.000000000 0.00000000 0.000000e+00
## BMI 0.0000000000 0.000000000 1.000000000 0.02101267 3.241738e-02
## HbA1c 0.0000000000 0.000000000 0.021012675 1.00000000 5.635442e-01
## T2D 0.0000608413 0.000000000 0.032417382 0.56354416 1.000000e+00
## SI 0.0047175889 0.000000000 0.000000000 0.01691996 4.099784e-03
## Purity 0.0000000000 0.000000000 0.000000000 0.00000000 0.000000e+00
## DIC 0.0000000000 0.011359304 0.000000000 0.03419322 7.579456e-02
## Meth_Batch 0.0000000000 0.005251804 0.000000000 0.00000000 0.000000e+00
## RNA_Batch 0.0022531214 0.000000000 0.000000000 0.00000000 2.053184e-02
## Random 0.0000000000 0.000000000 0.001175757 0.00000000 7.923939e-03
## SI Purity DIC Meth_Batch RNA_Batch
## Gender 0.004717589 0.000000000 0.000000000 0.000000000 0.00262000
## Age 0.000000000 0.000000000 0.011359304 0.005251804 0.00000000
## BMI 0.000000000 0.000000000 0.000000000 0.000000000 0.00000000
## HbA1c 0.016919962 0.000000000 0.034193217 0.000000000 0.00000000
## T2D 0.004099784 0.000000000 0.075794563 0.000000000 0.01798873
## SI 1.000000000 0.000000000 0.000000000 0.017228281 0.08353539
## Purity 0.000000000 1.000000000 0.067372444 0.004887090 0.04538269
## DIC 0.000000000 0.067372444 1.000000000 0.004515378 0.00000000
## Meth_Batch 0.017228281 0.004887090 0.004515378 1.000000000 0.05496894
## RNA_Batch 0.083535390 0.045382693 0.000000000 0.054968938 1.00000000
## Random 0.011848277 0.002161258 0.000000000 0.000000000 0.02205427
## Random
## Gender 0.000000000
## Age 0.000000000
## BMI 0.001175757
## HbA1c 0.000000000
## T2D 0.007923939
## SI 0.011848277
## Purity 0.002161258
## DIC 0.000000000
## Meth_Batch 0.000000000
## RNA_Batch 0.022054271
## Random 1.000000000
Now we can plot the heatmap of the adjusted R squared statistics and check how phenotypes and batches are related to each other:
library("pheatmap")
pheatmap(htl_adj_r_squared, display_numbers=TRUE, fontsize=12, main="Human Pancreatic Islets Phenotypes: Adjusted R^2 of Association")
We can see that HbA1c and T2D are very strongly connected, they should since the T2D diagnostics relies on blood glucose level measurements. They also weakly cluster together with BMI implying relation between those three phenotypes, which also makes a lot of sense. Despite Age and Gender are believed to be a risk factor for T2D, we do not observe a reliable association between those three phenotypes in our sample. Meth_Batch and RNA_Batch are moderately connected but happily do not seem to influence any of the important phenotypes, perhaps only the Stimulatory Index (SI). There is one more strange/unexpected observations following from the heatmap DIC seem to be correlated with the T2D status.
Let us read the Methylation Array data and have a look:
library("data.table")
## data.table 1.12.2 using 2 threads (see ?getDTthreads). Latest news: r-datatable.com
met<-suppressWarnings(as.data.frame(fread("methylation_filtered_normalized_no_combat_plus_11_samples_updated_2019_09_19.txt")))
rownames(met)<-met$V1
met$V1<-NULL
met<-subset(met,select=selected_ind)
met<-as.data.frame(t(met))
met[1:6,1:6]
## cg00000029 cg00000103 cg00000109 cg00000155 cg00000158 cg00000165
## ID1 -2.2563769 -0.5556851 3.716607 5.055956 0.8063106 -3.550196
## ID3 -2.0256036 -0.3609540 4.643902 5.147948 0.1622386 -3.680834
## ID4 -2.1724137 -0.2045822 4.312885 4.426633 -1.4258091 -3.465771
## ID6 -0.9692798 -0.9664077 4.118167 5.605515 -1.5650929 -3.894324
## ID8 -1.7859634 -0.7924335 4.690729 5.726665 -1.4988594 -3.344524
## ID10 -1.9853556 -0.7016344 3.623634 5.908261 0.2979108 -3.500956
dim(met)
## [1] 110 816790
Now the methylation data is coupled with the T2D status and ready for performing any statistical analysis such as PCA and PLS-DA.
Let us now check how batch-effects influence the Methylation EPIC Array data. The EPIC array was done using 21 batches. First of all, let us check if we observe clustering by batches on the PCA plot, i.e. if we observe genome-wide batch-effects:
library("mixOmics")
## Loading required package: MASS
## Loading required package: lattice
## Loading required package: ggplot2
##
## Loaded mixOmics 6.8.5
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us: citation('mixOmics')
X<-met
X<-log10(as.matrix(X)+10)
hist(as.matrix(met),breaks=100, main="Histogram of methylation values before log-transform")
hist(X,breaks=100, main="Histogram of methylation values after log-transform")
X[1:6,1:6]
## cg00000029 cg00000103 cg00000109 cg00000155 cg00000158 cg00000165
## ID1 0.8889442 0.9751705 1.137247 1.177708 1.0336774 0.8095465
## ID3 0.9016978 0.9840341 1.165657 1.180354 1.0069894 0.8006597
## ID4 0.8936279 0.9910230 1.155727 1.159165 0.9331931 0.8151944
## ID6 0.9557224 0.9558605 1.149778 1.193278 0.9260803 0.7857338
## ID8 0.9145566 0.9641449 1.167043 1.196637 0.9294772 0.8231791
## ID10 0.9038843 0.9684066 1.134293 1.201623 1.0127491 0.8128495
dim(X)
## [1] 110 816790
pca.met<-pca(X,ncomp=10,logratio='none',center=TRUE,scale=TRUE)
pca.met
## Eigenvalues for the first 10 principal components, see object$sdev^2:
## PC1 PC2 PC3 PC4 PC5 PC6
## 115694.199 72285.627 37860.830 29604.171 20621.344 17481.319
## PC7 PC8 PC9 PC10
## 14243.575 12238.119 10310.112 9935.456
##
## Proportion of explained variance for the first 10 principal components, see object$explained_variance:
## PC1 PC2 PC3 PC4 PC5 PC6
## 0.14164497 0.08849965 0.04635320 0.03624453 0.02524681 0.02140246
## PC7 PC8 PC9 PC10
## 0.01743848 0.01498319 0.01262272 0.01216403
##
## Cumulative proportion explained variance for the first 10 principal components, see object$cum.var:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 0.1416450 0.2301446 0.2764978 0.3127424 0.3379892 0.3593916 0.3768301
## PC8 PC9 PC10
## 0.3918133 0.4044360 0.4166000
##
## Other available components:
## --------------------
## loading vectors: see object$rotation
plot(pca.met,ylim=c(0,0.15))
plotIndiv(pca.met,comp=c(1,2),ind.names=TRUE,group=htl$Meth_Batch,ellipse=FALSE,legend=TRUE,title="PCA EPIC: Batch Effect")
plotIndiv(pca.met,comp=c(1,2),ind.names=TRUE,group=htl$Gender,ellipse=FALSE,legend=TRUE,title="PCA EPIC: Sex Effect")
plotIndiv(pca.met,comp=c(1,2),ind.names=TRUE,group=htl$T2D,ellipse=FALSE,legend=TRUE,title="PCA EPIC: T2D Effect")
plotIndiv(pca.met,comp=c(1,2),ind.names=TRUE,group=htl$DIC,ellipse=FALSE,legend=TRUE,title="PCA EPIC: DIC Effect")
We do not seem to see obvious clustering by batch, T2D or DIC on genome-wide level. However this does not mean that each individual CpG is not affected by batch-effects. A very obvious clustering comes from Males vs. Females samples that was present previously has disappeared completely due to the log-transform that made the data more normally-distributed and therefore reduced the batch.
To further quantify genome-wide batch-effetcs let us display how much of variation in each principal component is explained by the batch variables:
pc_adj_r_squared<-matrix(NA,ncol=dim(pca.met$x)[2],nrow=dim(htl)[2])
for(i in 1:dim(pca.met$x)[2])
{
print(i)
for(j in 1:dim(htl)[2])
{
pc_adj_r_squared[j,i]<-summary(lm(pca.met$x[,i]~htl[,j]))$adj.r.squared
}
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
pc_adj_r_squared<-as.data.frame(pc_adj_r_squared)
colnames(pc_adj_r_squared)<-colnames(pca.met$x)
rownames(pc_adj_r_squared)<-colnames(htl)
pc_adj_r_squared[pc_adj_r_squared<0]<-0
pc_adj_r_squared
## PC1 PC2 PC3 PC4 PC5
## Gender 0.000000000 0.002883034 0.2040337705 0.168558821 0.0773476528
## Age 0.005556264 0.000000000 0.0000000000 0.000000000 0.0361323499
## BMI 0.000000000 0.000000000 0.0000000000 0.050597761 0.0000000000
## HbA1c 0.017450690 0.033859775 0.0000000000 0.000000000 0.0281105543
## T2D 0.009839612 0.018043349 0.0030073130 0.004095874 0.0791543562
## SI 0.000000000 0.000000000 0.0008244417 0.000000000 0.0000000000
## Purity 0.228131119 0.008502569 0.0000000000 0.000000000 0.0000000000
## DIC 0.009739542 0.005212007 0.0174649418 0.000000000 0.0007758333
## Meth_Batch 0.024256207 0.044989391 0.0625984283 0.009087351 0.0421525301
## RNA_Batch 0.000000000 0.177450392 0.0033032143 0.000000000 0.0155197502
## Random 0.000000000 0.000000000 0.0000000000 0.000000000 0.0099907849
## PC6 PC7 PC8 PC9 PC10
## Gender 0.335062644 0.1155525363 0.003323964 0.000000000 0.003580318
## Age 0.000000000 0.0000000000 0.000000000 0.000000000 0.000000000
## BMI 0.010516216 0.0000000000 0.007627480 0.000000000 0.042050691
## HbA1c 0.003439309 0.0000000000 0.151022586 0.112110400 0.064940652
## T2D 0.000000000 0.0000000000 0.167839171 0.075449380 0.079391925
## SI 0.000000000 0.0086608167 0.000000000 0.000000000 0.096121426
## Purity 0.002903170 0.0000000000 0.000000000 0.000000000 0.000000000
## DIC 0.000000000 0.0005782383 0.000000000 0.004504305 0.000000000
## Meth_Batch 0.000000000 0.0386075768 0.135369702 0.000000000 0.057472318
## RNA_Batch 0.000000000 0.0000000000 0.000000000 0.000000000 0.003183126
## Random 0.000000000 0.0000000000 0.000000000 0.004024466 0.000000000
library("pheatmap")
pheatmap(pc_adj_r_squared, display_numbers=TRUE, fontsize=12, cluster_cols=FALSE, main="Human Islets Methylation Array: Adj R^2 of Association between PCs and Phenotypes")
We conclude that PC1 is mostly due to Purity of the Human Pancreatic Islets, while PC2 is mostly due to RNAseq batch which does not make sense at the first glance but my interpretation is that it is somehow confounded by the Methylation batch. The Methylation batch contributes almost only to the PC8, which is good. The PC3,4,5,6,7 the Sex as a main contribution. Overall Sexx seems to be the strongest variable associated with almost all PCs but not PC1 and PC2. The phenotype of interest, T2D and HbA1c are included into PC 5 and especially 8. Let us out of curiosity display top Loadings for PC 5,8, we will do a proper feature selection with PLS-DA and LASSO later, but now we can still get a feeling which CpG sites seem to be correlated with diabetes phenotypes:
plotLoadings(pca.met,comp=5,method='median',contrib='max',ndisplay=20)
plotLoadings(pca.met,comp=8,method='median',contrib='max',ndisplay=20)
Now we will perform PLS-DA analysis:
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3617838 193.3 6997732 373.8 6997732 373.8
## Vcells 299732341 2286.8 614933300 4691.6 614933300 4691.6
library("mixOmics")
Y<-as.factor(as.character(htl$T2D))
summary(Y)
## 0 1
## 78 32
We have approximetaly 29% of T2D individuals which is not crazy unbalanced, so there is a hope that the model does not learn only the majority class. Important to mention that a very naive / stupid classifier that predicts Non-T2D for any given individual should thus achieve 71% accuracy of classification. So our PLS-DA model should be better than the naive classifier and outperform the lower threshold of 71%.
my_folds=2
my_nrepeat=5
my_progressBar=FALSE
my_cpus=1
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3618107 193.3 6997732 373.8 6997732 373.8
## Vcells 299733447 2286.8 614933300 4691.6 614933300 4691.6
met_plsda.perf<-plsda(X, Y, ncomp=10)
ptm<-proc.time()
perf.plsda<-perf(met_plsda.perf, validation='Mfold', folds=my_folds, progressBar=my_progressBar, nrepeat=my_nrepeat)
proc.time()-ptm
## user system elapsed
## 754.194 112.321 866.600
perf.plsda
##
## Call:
## perf.mixo_plsda(object = met_plsda.perf, validation = "Mfold", folds = my_folds, nrepeat = my_nrepeat, progressBar = my_progressBar)
##
## Main numerical outputs:
## --------------------
## Error rate (overall or BER) for each component and for each distance: see object$error.rate
## Error rate per class, for each component and for each distance: see object$error.rate.class
## Prediction values for each component: see object$predict
## Classification of each sample, for each component and for each distance: see object$class
## AUC values: see object$auc if auc = TRUE
##
## Visualisation Functions:
## --------------------
## plot
head(perf.plsda$error.rate)
## $overall
## max.dist centroids.dist mahalanobis.dist
## comp1 0.2872727 0.3218182 0.3218182
## comp2 0.2200000 0.2290909 0.2163636
## comp3 0.2236364 0.2309091 0.2309091
## comp4 0.1872727 0.2127273 0.1890909
## comp5 0.2036364 0.2127273 0.2036364
## comp6 0.1963636 0.2109091 0.1963636
## comp7 0.2200000 0.2327273 0.2200000
## comp8 0.1981818 0.2181818 0.1981818
## comp9 0.2163636 0.2363636 0.2163636
## comp10 0.2072727 0.2400000 0.2072727
##
## $BER
## max.dist centroids.dist mahalanobis.dist
## comp1 0.3960737 0.3798878 0.3798878
## comp2 0.3283654 0.3218750 0.3184295
## comp3 0.3456731 0.3489583 0.3452724
## comp4 0.2905449 0.3195513 0.2918269
## comp5 0.3241987 0.3269231 0.3241987
## comp6 0.3245994 0.3274840 0.3245994
## comp7 0.3412660 0.3520833 0.3412660
## comp8 0.3166667 0.3326122 0.3166667
## comp9 0.3368590 0.3546474 0.3368590
## comp10 0.3286058 0.3479968 0.3286058
head(perf.plsda$error.rate.class)
## $max.dist
## comp1 comp2 comp3 comp4 comp5 comp6
## 0 0.1358974 0.06923077 0.05384615 0.04358974 0.03589744 0.01794872
## 1 0.6562500 0.58750000 0.63750000 0.53750000 0.61250000 0.63125000
## comp7 comp8 comp9 comp10
## 0 0.05128205 0.03333333 0.04871795 0.03846154
## 1 0.63125000 0.60000000 0.62500000 0.61875000
##
## $centroids.dist
## comp1 comp2 comp3 comp4 comp5 comp6 comp7
## 0 0.2410256 0.10000 0.06666667 0.06410256 0.05384615 0.04871795 0.06666667
## 1 0.5187500 0.54375 0.63125000 0.57500000 0.60000000 0.60625000 0.63750000
## comp8 comp9 comp10
## 0 0.05897436 0.07179487 0.08974359
## 1 0.60625000 0.63750000 0.60625000
##
## $mahalanobis.dist
## comp1 comp2 comp3 comp4 comp5 comp6
## 0 0.2410256 0.07435897 0.07179487 0.04615385 0.03589744 0.01794872
## 1 0.5187500 0.56250000 0.61875000 0.53750000 0.61250000 0.63125000
## comp7 comp8 comp9 comp10
## 0 0.05128205 0.03333333 0.04871795 0.03846154
## 1 0.63125000 0.60000000 0.62500000 0.61875000
plot(perf.plsda,overlay='dist',sd=TRUE)
BER Mahalanobis and max distances seem to reach their minimum at ncomp=4, so we will use 4 PCs for further PLS-DA analysis.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3666808 195.9 6997732 373.8 6997732 373.8
## Vcells 416669234 3179.0 1271044136 9697.3 1271044136 9697.3
met_plsda<-plsda(X, Y, ncomp=4)
met_plsda$explained_variance
## $X
## comp 1 comp 2 comp 3 comp 4
## 0.09832754 0.06757987 0.07539608 0.03425044
##
## $Y
## comp 1 comp 2 comp 3 comp 4
## 1.0000000 0.6569264 0.2839192 0.1493013
background = background.predict(met_plsda, comp.predicted=2, dist = "mahalanobis.dist")
plotIndiv(met_plsda, comp=c(1,2), group=Y, ind.names=TRUE, ellipse=FALSE, background=background, legend=TRUE, title="Human Pancreatic Islets: PLS-DA of Methylation EPIC Array")
Interestingly, the samples 220 and 248, that are not T2Ds but end up on the T2D side of the decision boundary, seem to have abnormal BMI values so very likely they were non-diagnosed type 2 diabetics.
htl[rownames(htl)=="ID220" | rownames(htl)=="ID248",]
## Gender Age BMI HbA1c T2D SI Purity DIC Meth_Batch RNA_Batch
## ID220 Male 66 31.1 NA 0 2.1 96 7 21 <NA>
## ID248 Male 43 30.9 5.8 0 14.9 94 2 19 <NA>
## Random
## ID220 4.406719
## ID248 5.389072
plotLoadings(met_plsda, comp=1, method='median', contrib='max', ndisplay=20)
plotLoadings(met_plsda, comp=2, method='median', contrib='max', ndisplay=20)
#plotVar(met_plsda, comp=c(1,2), var.names=list(rownames(met)), cex=3)
#cim(met_plsda, row.sideColors=color.mixo(Y), margins=c(8,4))
Many of the CpG sites included to the PLS-DA analysis are actually non-informative and bring noise to the analysis. Therefore one needs to apply sparse cleaning algorithm to select a subset of genes that best discriminate between diabetics and non diabetics. We will use LASSO for selection of most informative CpG sites. The tuning procedure below is performed on one component at a time and selects an optimal number of genes that provide lowest error rate. Since from the previous analysis we concluded that the minimal error rate is achieved on the first 4 principal components, we will use ncomp=4 in the sPLS-DA analysis.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3678054 196.5 6997732 373.8 6997732 373.8
## Vcells 518795113 3958.1 1271044136 9697.3 1271044136 9697.3
list.keepX<-c(1:10,seq(20,100,10))
ptm<-proc.time()
tune.splsda.islets<-tune.splsda(X,Y,ncomp=4,validation='Mfold',folds=my_folds,progressBar=my_progressBar,dist='mahalanobis.dist',test.keepX=list.keepX,nrepeat=my_nrepeat)
proc.time()-ptm
## user system elapsed
## 3061.378 22.094 3083.747
head(tune.splsda.islets$error.rate)
## comp1 comp2 comp3 comp4
## 1 0.2661859 0.2370192 0.2184295 0.2171474
## 2 0.2245192 0.2405449 0.2208333 0.2202724
## 3 0.2217949 0.2304487 0.2158654 0.2215545
## 4 0.2179487 0.2234776 0.2158654 0.2259615
## 5 0.2223558 0.2203526 0.2145833 0.2246795
## 6 0.2141026 0.2089744 0.2195513 0.2265224
tune.splsda.islets$choice.keepX
## comp1 comp2 comp3 comp4
## 10 90 5 1
plot(tune.splsda.islets,optimal=TRUE,sd=TRUE)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3688511 197.0 6997732 373.8 6997732 373.8
## Vcells 518808743 3958.2 1460209419 11140.6 1460209419 11140.6
select.keepX<-as.numeric(tune.splsda.islets$choice.keepX)
splsda.islets<-splsda(X, Y, ncomp=4, keepX=select.keepX)
background = background.predict(splsda.islets, comp.predicted=2, dist = "mahalanobis.dist")
plotIndiv(splsda.islets,comp=c(1,2),group=Y,ind.names=TRUE,ellipse=FALSE,background=background,legend=TRUE,title="Human Pancreatic Islets: PLS-DA of Methylation EPIC Array")
plotVar(splsda.islets,comp=c(1,2),var.names=list(colnames(met)),cex=3)
head(sort(abs(splsda.islets$loadings$X[,"comp1"]),decreasing=TRUE),10)
## cg02988288 cg14527110 cg07175985 cg02966936 cg12220370 cg13566279
## 0.65158204 0.51861603 0.30562253 0.26107460 0.22463620 0.21787376
## cg14490520 cg06184251 cg03770217 cg17826980
## 0.17448439 0.10462873 0.07089807 0.02375425
head(sort(abs(splsda.islets$loadings$X[,"comp2"]),decreasing=TRUE),10)
## cg17083819 cg01475538 cg19102271 cg06520307 cg04945076 cg09781705
## 0.3415356 0.2537688 0.2230905 0.2084387 0.1909318 0.1809503
## cg05199346 cg14923547 cg11511842 cg18506744
## 0.1792540 0.1714665 0.1706109 0.1692890
plotLoadings(splsda.islets,comp=1,method='median',contrib='max',ndisplay=20)
## 'ndisplay' value is larger than the number of selected variables! It has been reseted to 10 for block X
plotLoadings(splsda.islets,comp=2,method='median',contrib='max',ndisplay=20)
cim(splsda.islets,row.sideColors=color.mixo(Y),margins=c(8,4))
Finally we will randomly split the methylation data set into training (80% of samples) and validation (20% of samples) sets. This is needed to validate predictions of sPLS-DA trained on the training data set.
We are going to perform again the tuning of the sPLS-DA model on the training data set. From the previous analysis we know that first 3 PCs minimize the mahalanobis.dist balanced error rate. But here for simplicity since we have only a few samples let us use ncomp=2 and re-tune the optimal numbers of genes although they should be quite similar to the ones obtained in the previous section as the training set comprises 80% of the full data set. For this purpose, we will choose again a small step of the gene grid list.keepX. Now we will apply the tuned and trained model to the validation data set and generate predictions of T2D status and compute the accuracy of the prediction.
The accuracy of T2D vs NonT2D classification is very high, much higher than the 72% accuracy of the naive model. Now we will build confidence intervals by splitting the data set into train and test multiple times and running the PLS-DA classifier for every split.
gc()
comp1_acc<-vector()
comp2_acc<-vector()
for(k in 1:30)
{
print(paste0("Working with split No.", k))
gc()
set.seed(k+100)
test_samples<-rownames(met)[sample(1:length(rownames(met)),round(length(rownames(met))*0.2))]
train_samples<-rownames(met)[!rownames(met)%in%test_samples]
X.train<-X[match(train_samples,rownames(X)),]
X.test<-X[match(test_samples,rownames(X)),]
Y.train<-as.factor(as.character(htl[match(train_samples,rownames(htl)),]$T2D))
Y.test<-as.factor(as.character(htl[match(test_samples,rownames(htl)),]$T2D))
list.keepX<-c(1:10,seq(20,100,10))
tune.splsda.train.met<-tune.splsda(X.train, Y.train, ncomp=4, validation='Mfold', folds=my_folds,
progressBar=my_progressBar, dist='mahalanobis.dist', logratio="none",
test.keepX=list.keepX, nrepeat=my_nrepeat)
splsda.met.train<-splsda(X.train, Y.train, logratio='none', ncomp=4,
keepX=as.numeric(tune.splsda.train.met$choice.keepX))
splsda.met.predict<-predict(splsda.met.train, X.test, dist='mahalanobis.dist')
print(table(splsda.met.predict$class$mahalanobis.dist[,1],Y.test))
acc1<-round((sum(diag(table(splsda.met.predict$class$mahalanobis.dist[,1],Y.test)))/sum(table(splsda.met.predict$class$mahalanobis.dist[,1],Y.test)))*100)
print(paste0("Classification Accuracy from PLS Component 1: ", acc1))
print(table(splsda.met.predict$class$mahalanobis.dist[,2],Y.test))
acc2<-round((sum(diag(table(splsda.met.predict$class$mahalanobis.dist[,2],Y.test)))/sum(table(splsda.met.predict$class$mahalanobis.dist[,2],Y.test)))*100)
print(paste0("Classification Accuracy from PLS Component 2: ", acc2))
comp1_acc<-append(comp1_acc,acc1)
comp2_acc<-append(comp2_acc,acc2)
print("***********************************************************")
}
write.table(comp1_acc,file="Comp1_PLS_Meth_Acc.txt",col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
write.table(comp2_acc,file="Comp2_PLS_Meth_Acc.txt",col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3736614 199.6 6997732 373.8 6997732 373.8
## Vcells 621011846 4738.0 1460209419 11140.6 1460209419 11140.6
comp1_acc_arch<-as.numeric(scan("Comp1_PLS_Meth_Acc.txt",what="character"))
comp2_acc_arch<-as.numeric(scan("Comp2_PLS_Meth_Acc.txt",what="character"))
#comp1_acc_arch<-c(comp1_acc,comp1_acc_arch)
#comp2_acc_arch<-c(comp2_acc,comp2_acc_arch)
hist(comp1_acc_arch,breaks=20,xlab="ACCURACY",main="Accuracy T2D Prediction from Methylation: PLS1",col="darkgreen")
mtext(paste0("Accuracy = ",mean(comp1_acc_arch)," +/- ",2*sd(comp1_acc_arch)))
hist(comp2_acc_arch,breaks=20,xlab="ACCURACY",main="Accuracy T2D Prediction from Methylation: PLS2",col="darkgreen")
mtext(paste0("Accuracy = ",mean(comp2_acc_arch)," +/- ",2*sd(comp2_acc_arch)))
We can compute how significantly different is the accuracy of the PLS-DA model compared to the naive 71% accuracy:
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3736678 199.6 6997732 373.8 6997732 373.8
## Vcells 621011419 4738.0 1460209419 11140.6 1460209419 11140.6
sum(comp1_acc_arch<=71)/length(comp1_acc_arch)
## [1] 0
sum(comp2_acc_arch<=71)/length(comp2_acc_arch)
## [1] 0
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2918858 155.9 6997732 373.8 6997732 373.8
## Vcells 211758027 1615.6 1168167536 8912.5 1460209419 11140.6
Finally here is the details on the system on which this document was compiled:
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=sv_SE.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=sv_SE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=sv_SE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=sv_SE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] mixOmics_6.8.5 ggplot2_3.2.1 lattice_0.20-38 MASS_7.3-51.4
## [5] data.table_1.12.2 pheatmap_1.0.12 knitr_1.24.5 rmarkdown_1.15.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.2 RSpectra_0.15-0 plyr_1.8.4
## [4] compiler_3.6.1 pillar_1.4.2 RColorBrewer_1.1-2
## [7] tools_3.6.1 digest_0.6.20 evaluate_0.14
## [10] tibble_2.1.3 gtable_0.3.0 pkgconfig_2.0.2
## [13] rlang_0.4.0 Matrix_1.2-17 igraph_1.2.4.1
## [16] parallel_3.6.1 yaml_2.2.0 xfun_0.8
## [19] gridExtra_2.3 withr_2.1.2 stringr_1.4.0
## [22] dplyr_0.8.3 grid_3.6.1 tidyselect_0.2.5
## [25] ellipse_0.4.1 glue_1.3.1 R6_2.4.0
## [28] rARPACK_0.11-0 reshape2_1.4.3 tidyr_0.8.3
## [31] purrr_0.3.2 corpcor_1.6.9 magrittr_1.5
## [34] matrixStats_0.54.0 scales_1.0.0 htmltools_0.3.6
## [37] assertthat_0.2.1 colorspace_1.4-1 labeling_0.3
## [40] stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0
## [43] crayon_1.3.4